Loading data and independent filtering

The first two columns of the datafile should be annotation (for example probe ID and gene Symbol). Numerical data start from column 3

#graphical parameters
source("setPowerPointStyle.R")
setPowerPointStyle()

data_file = "TNF_IFN_2.csv"
my_data = read.csv(data_file,sep = '\t')
head(my_data)
##   probe  gene     CTRL   CTRL.1    CTRL.2  IFN.2.5 IFN.2.5.1 IFN.2.5.2
## 1  RFC2  RFC2 8.324769 8.324769  8.132764 8.375789  8.373458  8.324769
## 2 HSPA6 HSPA6 5.413300 4.830096  5.929786 8.329249  8.006876  7.950832
## 3  PAX8  PAX8 5.594199 5.581641  5.581641 5.661038  5.581641  5.581641
## 4  UBA7  UBA7 7.361664 7.307654  7.336399 7.336399  7.217897  7.197828
## 5  CCL5  CCL5 9.985708 8.999638 10.371103 4.692704  4.517400  4.996733
## 6 ESRRA ESRRA 6.335024 6.373706  6.539027 6.630504  6.847058  6.778308
##     TNF.2.5 TNF.2.5.1 TNF.2.5.2 TNF.IFN.2.5 TNF.IFN.2.5.1 TNF.IFN.2.5.2
## 1  8.324769  8.462647  8.386247    8.324769      8.338911      8.192397
## 2  6.248139  6.207699  5.940508    7.677756      7.484339      7.540568
## 3  5.581641  5.581641  5.581641    5.504055      5.411538      5.545741
## 4  7.671212  7.607462  7.470790    7.420842      7.336399      7.095814
## 5 12.476331 12.450789 12.326590    6.663396      5.642317      6.052862
## 6  6.155354  6.370346  6.397341    6.975653      6.886920      6.539027
#get numeric data
expression_data = my_data[,-(1:2)]
#log-transform if needed

if (max(expression_data)>25) expression_data = log2(expression_data)

#we assume the same number of samples for each condition
samples = ncol(expression_data)/4

design = c(rep("0", samples), rep("X", samples),
           rep("Y", samples), rep("Y+X", samples))


#removing uninformative probes (very small coefficient of variation)
cof_cutoff = 0.05

cof = apply(expression_data, 1, function(x) sd(x)/mean(x))

cof_filter = which(cof > cof_cutoff)

PCA of samples

my.pca <- prcomp(t(expression_data[cof_filter, ]), center = TRUE, scale=TRUE)

cols = c(rep("black", samples), rep("red", samples),
                rep("blue", samples), rep("yellow", samples))

plot(my.pca$x[,1], my.pca$x[,2], col = cols,
     xlab = "PC1", ylab = "PC2", pch = 20, cex = 1.5, main = data_file)

legend("bottomleft", pch = 20, col = unique(cols), 
       legend = c("0","X","Y","X+Y"), bty = 'n',cex = 1)

Filtering based on minimum group differences

source("filter_data_on_deltas.R")

my_data_filtered = filter_data_on_deltas(my_data)
 
head(my_data_filtered)
##      probe    gene     CTRL   CTRL.1    CTRL.2  IFN.2.5 IFN.2.5.1
## 2    HSPA6   HSPA6 5.413300 4.830096  5.929786 8.329249  8.006876
## 5     CCL5    CCL5 9.985708 8.999638 10.371103 4.692704  4.517400
## 10     PXK     PXK 9.294279 9.508841  9.240162 9.846890 10.014370
## 11 MSANTD3 MSANTD3 9.590901 9.274737  9.831871 7.027361  7.053774
## 12 SLC46A1 SLC46A1 6.048298 6.617517  6.384521 7.951326  7.867392
## 14    PIGX    PIGX 9.020155 8.955406  9.036642 8.261126  8.799131
##    IFN.2.5.2   TNF.2.5 TNF.2.5.1 TNF.2.5.2 TNF.IFN.2.5 TNF.IFN.2.5.1
## 2   7.950832  6.248139  6.207699  5.940508    7.677756      7.484339
## 5   4.996733 12.476331 12.450789 12.326590    6.663396      5.642317
## 10 10.093442  8.939042  9.193210  8.997016    9.972881     10.101698
## 11  7.320024  8.754430  8.966617  8.826732    7.349552      7.153825
## 12  7.562287  6.122274  6.428453  6.443509    8.016323      7.685079
## 14  8.650005  8.832036  8.905883  8.848334    9.028034      9.129482
##    TNF.IFN.2.5.2
## 2       7.540568
## 5       6.052862
## 10     10.142173
## 11      7.248851
## 12      7.787337
## 14      8.781471

Random forest classifier

source("find_optimal_match.R")

my_data_filtered_matched = find_optimal_match(my_data_filtered)

head(my_data_filtered_matched)
##      probe    gene     CTRL   CTRL.1    CTRL.2  IFN.2.5 IFN.2.5.1
## 2    HSPA6   HSPA6 5.413300 4.830096  5.929786 8.329249  8.006876
## 5     CCL5    CCL5 9.985708 8.999638 10.371103 4.692704  4.517400
## 10     PXK     PXK 9.294279 9.508841  9.240162 9.846890 10.014370
## 11 MSANTD3 MSANTD3 9.590901 9.274737  9.831871 7.027361  7.053774
## 12 SLC46A1 SLC46A1 6.048298 6.617517  6.384521 7.951326  7.867392
## 14    PIGX    PIGX 9.020155 8.955406  9.036642 8.261126  8.799131
##    IFN.2.5.2   TNF.2.5 TNF.2.5.1 TNF.2.5.2 TNF.IFN.2.5 TNF.IFN.2.5.1
## 2   7.950832  6.248139  6.207699  5.940508    7.677756      7.484339
## 5   4.996733 12.476331 12.450789 12.326590    6.663396      5.642317
## 10 10.093442  8.939042  9.193210  8.997016    9.972881     10.101698
## 11  7.320024  8.754430  8.966617  8.826732    7.349552      7.153825
## 12  7.562287  6.122274  6.428453  6.443509    8.016323      7.685079
## 14  8.650005  8.832036  8.905883  8.848334    9.028034      9.129482
##    TNF.IFN.2.5.2 score prof_index
## 2       7.540568 0.290         52
## 5       6.052862 0.536        108
## 10     10.142173 0.426         86
## 11      7.248851 0.500         68
## 12      7.787337 0.660         21
## 14      8.781471 0.598          2
# library(caret)
# library(randomForest)
# source("match11.R")
# source("setPowerPointStyle.R")
# setPowerPointStyle()
#  
# load("rf_model")
#  
# profile_features = apply(my_data_filtered[, -(1:2)] + replicate(12, rnorm(nrow(my_data_filtered[, -(1:2)]), sd = 0.0001)), 1, match11)
# 
# 
# predicted_classes=predict(rf_model, newdata=data.frame(t(profile_features)), type = 'prob')
#  
# max_match=t(apply(predicted_classes, 1, function(x) c(max(x), which.max(x))) )

Visualizing all integration profiles

Appendix: plotting all profiles

source("generate_all_profiles.R")
generate_all_profiles()